


import scrapy


class MySpider(scrapy.Spider):
    """Crawl sj.qq.com: category links -> app list pages -> app detail items.

    The crawl starts at the single URL in ``start_urls``. Category links
    found there lead to app-list pages, whose links lead to app detail
    pages; each detail page yields one item dict.
    """

    # Name used to tell this spider apart from others.
    name = "test1"
    # Domains the spider is allowed to visit.
    allowed_domains = ['sj.qq.com']
    start_urls = ['http://sj.qq.com/myapp/']

    def parse(self, response):
        """Handle the start-page response by extracting its category links.

        The start URL is itself the category page, so the response is
        handed straight to ``parse_category`` rather than requesting the
        same URL a second time.
        """
        return self.parse_category(response)

    def parse_category(self, response):
        """Yield one request per category link found on the page.

        Also writes the raw response body to ``ttt.txt`` in the current
        working directory, overwriting any previous content.
        """
        # Raw dump of the category page, kept for inspecting the markup.
        with open('ttt.txt', 'wb') as f:
            f.write(response.body)

        category_links = response.xpath(
            "//dl[@class='clearfix']/dd/a/@href"
        ).extract()
        for url in category_links:
            # hrefs may be relative; resolve them against the page URL.
            cat_url = response.urljoin(url)
            yield scrapy.Request(url=cat_url, callback=self.parse_app_list)

    def parse_app_list(self, response):
        """Yield one request per app link found on an app-list page."""
        app_links = response.xpath(
            "//ul[@class='app-list clearfix']/li/div/div"
            "/a[@class='name ofh']/@href"
        ).extract()
        for url in app_links:
            # hrefs may be relative; resolve them against the page URL.
            app_url = response.urljoin(url)
            yield scrapy.Request(url=app_url, callback=self.parse_app_detail)

    def parse_app_detail(self, response):
        """Yield a single item dict scraped from an app detail page.

        Each value is the first matching text node, or ``None`` when the
        XPath matches nothing.
        """
        yield {
            'title': response.xpath(
                "//div[@class='det-name-int']/text()").extract_first(),
            'rate': response.xpath(
                "//div[@class='com-blue-star-num']/text()").extract_first(),
            # Leading '//' is required so the whole document is searched,
            # consistent with the other fields.
            'version': response.xpath(
                "//div[@class='det-othinfo-data']/text()").extract_first(),
            'description': response.xpath(
                "//div[@class='det-app-data-info']/text()").extract_first(),
        }